library(magrittr)
library(dplyr)
library(ggplot2)
library(cowplot)
library(ggrepel)
library(ggpubr)
library(scHelper)
library(biomaRt)
library(pheatmap)
library(SummarizedExperiment)
library(qs)
system("sudo apt-get update\nsudo apt-get install libnetcdf-dev -y")
library(DEP)
tt <- Sys.time()
Load data
metadata <- read.delim("/work/proteomics/02_data/Metadata_Cohort_100723.csv", sep = ";", dec = ",", header = T) %>%
mutate(intern = gsub("K", "", .$K.no.),
Diagnosis = gsub("Unknown", "MSA", .$Diagnosis)) # 1418 should be MSA
sample.meta <- read.delim("/work/proteomics/02_data/Kohorte_info.csv", sep = ";", header = T)
Plots
# Prep
stat.sex <- metadata %$% table(Diagnosis, Sex) %>% fisher.test()
stat.hemi <- metadata %$% table(Diagnosis, Hemisphere) %>% fisher.test()
comp <- list(c("CTRL","MSA"),
c("CTRL","PD"),
c("CTRL","PSP"),
c("MSA","PD"),
c("MSA","PSP"),
c("PD","PSP"))
# Plots
p1 <- metadata %>%
ggplot(aes(Diagnosis, fill = Diagnosis)) +
geom_bar(stat = "count") +
theme_bw() +
labs(x ="", y = "", title = "Samples") +
scale_y_continuous(breaks = seq(2, 12, 2)) +
scale_fill_manual(values = ggsci::pal_npg()(5))
p2 <- metadata %>%
ggplot(aes(Sex, fill = Diagnosis)) +
geom_bar(position = "dodge") +
theme_bw() +
labs(x ="", y = "", title = "Sex", subtitle = paste0(stat.sex$method," : ",round(stat.sex$p.value, digits = 4))) +
scale_y_continuous(breaks = seq(2, 10, 2)) +
scale_fill_manual(values = ggsci::pal_npg()(5))
p3 <- metadata %>%
ggplot(aes(Diagnosis, PMI, fill = Diagnosis)) +
geom_boxplot() +
theme_bw() +
labs(x ="", y = "", title = "PMI") +
scale_fill_manual(values = ggsci::pal_npg()(5)) +
stat_compare_means(method = "t.test", comparisons = comp)
p3 <- p3 +
stat_compare_means(method = "anova", label.y = layer_scales(p3, 1)$y$range$range[2])
p4 <- metadata %>%
ggplot(aes(Diagnosis, fill = Subtype)) +
geom_bar(stat = "count") +
theme_bw() +
labs(x ="", y = "", title = "Subtypes") +
scale_y_continuous(breaks = seq(2, 12, 2)) +
scale_fill_manual(values = ggsci::pal_npg()(10)[5:9])
p5 <- metadata %>%
ggplot(aes(Diagnosis, Age, fill = Diagnosis)) +
geom_boxplot() +
theme_bw() +
labs(x ="", y = "", title = "Age (uncorrected post-hoc test)") +
scale_fill_manual(values = ggsci::pal_npg()(5)) +
stat_compare_means(method = "t.test", comparisons = comp)
p5 <- p5 +
stat_compare_means(method = "anova", label.y = layer_scales(p5, 1)$y$range$range[2])
p6 <- metadata %>%
filter(Diagnosis != "CTRL", !is.na(Duration)) %>%
ggplot(aes(Diagnosis, Duration, fill = Diagnosis)) +
geom_boxplot() +
theme_bw() +
labs(x ="", y = "", title = "Disease duration") +
scale_y_continuous(breaks = seq(-20, 14, 1)) +
scale_fill_manual(values = ggsci::pal_npg()(5)[2:5])
p7 <- metadata %>%
ggplot(aes(Hemisphere, fill = Diagnosis)) +
geom_bar(position = "dodge") +
theme_bw() +
labs(x ="", y = "", title = "Hemisphere", subtitle = paste0(stat.hemi$method," : ",round(stat.hemi$p.value, digits = 4))) +
scale_y_continuous(breaks = seq(2, 10, 2)) +
scale_fill_manual(values = ggsci::pal_npg()(5))
# Output
plot_grid(plotlist = list(p1, p2, p3, p4, p5, p6, p7), ncol = 3)
# dat.raw <- read.delim("/work/proteomics/02_data/BREG-ALL-295f-42m-nXRN-a-sum-Top3-SP&IMGT-Cvar-lib_v18.0_Report_Protein_Quant_Pivot_.tsv", header = T) %>%
dat.raw <- read.delim("/work/proteomics/02_data/BREG-ALL-295f-nXRN-a-sum-Top3-SP&INRET-Cvar-lib_20.0_Report_Protein Quant (Pivot).tsv", header = T) %>%
.[-ncol(.)]
genes <- dat.raw$PG.ProteinGroups %>%
strsplit(";", T)
df <- colnames(dat.raw)[-1] %>%
setNames(., .) %>%
strsplit("m_") %>%
sget(2) %>%
strsplit(".", fixed = T) %>%
{data.frame(id = names(.),
experiment = sget(., 1),
sample = sget(., 2),
area = sget(., 3))} %>%
mutate(area = strsplit(area, "_") %>%
sget(1)) %>%
`rownames<-`(NULL) %>%
mutate(area = gsub("PUT", "STR", area))
# Old data
df.pfc <- df %>%
filter(experiment == "BREG18") %>%
mutate(area = "PFC")
sample.meta.pfc <- sample.meta %>%
filter(Compartment == "Brain (dmPFC)") %>%
mutate(Sample = gsub("-", "", Sample))
df.pfc %<>%
mutate(intern = sample.meta.pfc$Intern[match(df.pfc$sample, sample.meta.pfc$Sample)],
Diagnosis = sample.meta.pfc$Diagnosis[match(df.pfc$sample, sample.meta.pfc$Sample)])
# New data
df.known <- df %>%
filter(experiment == "BREG",
!grepl("RR", sample)) %>%
mutate(intern = sample,
area = gsub("CBX", "CER", area)) %>%
mutate(Diagnosis = metadata$Diagnosis[match(.$intern, metadata$intern)])
df.unknown <- df %>%
filter(experiment == "BREG",
grepl("RR", sample))
sample.meta.unknown <- sample.meta %>%
filter(Compartment != "Brain (dmPFC)") %>%
dplyr::select(Sample, Intern, Compartment, Diagnosis) %>%
mutate(Sample = gsub("-", "", Sample),
Intern = gsub("K", "", Intern) %>%
strsplit("-") %>%
sget(1))
df.unknown %<>%
mutate(intern = sample.meta.unknown$Intern[match(.$sample, sample.meta.unknown$Sample)],
area = gsub("CBX", "CER", area)) %>%
mutate(Diagnosis = metadata$Diagnosis[match(.$intern, metadata$intern)])
# Complete
# df.complete <- rbind(df.pfc, df.known, df.unknown)
df.complete <- rbind(df.known, df.unknown)
# Samples per donors per area
df.complete %$%
table(intern, area)
## area
## intern CER FC MED SN STR
## 1358 0 1 0 1 1
## 1370 1 1 1 1 1
## 1376 0 1 0 1 1
## 1379 1 1 1 1 1
## 1381 2 1 1 2 2
## 1385 1 1 1 1 1
## 1387 1 1 1 1 1
## 1388 1 1 1 1 1
## 1391 1 1 1 0 1
## 1392 2 0 1 2 1
## 1394 2 1 1 2 2
## 1397 1 1 1 1 1
## 1398 2 0 1 1 1
## 1400 1 1 1 1 1
## 1401 1 1 1 1 1
## 1406 1 1 1 1 1
## 1407 2 0 1 1 1
## 1410 1 1 1 1 1
## 1411 1 1 1 1 1
## 1412 1 1 0 1 1
## 1414 1 1 1 1 1
## 1418 1 0 0 0 0
## 1420 1 1 1 1 1
## 1421 1 1 0 0 1
## 1424 1 1 0 1 1
## 1429 1 1 0 1 1
## 1436 1 1 1 0 1
## 1440 1 1 1 1 1
## 1446 2 2 1 2 2
## 1449 1 1 0 1 1
## 1450 1 0 0 1 0
## 1453 1 1 1 1 1
## 1456 1 0 0 1 0
## 1457 1 1 1 1 1
## 1458 1 1 0 1 1
## 1461 1 1 0 1 1
## 1462 2 0 1 1 1
## 1464 2 1 1 2 2
## 1465 1 1 0 1 1
## 1466 1 1 1 1 1
## 1467 1 1 1 1 1
## 1469 1 1 1 1 1
## 1490 1 1 1 1 1
## 1492 1 1 1 1 1
## 1494 1 1 1 1 1
## 1495 1 1 1 1 1
# Samples per disease per area
df.complete %$%
table(Diagnosis, area)
## area
## Diagnosis CER FC MED SN STR
## CTRL 13 10 8 13 11
## MSA 14 6 9 9 10
## PD 11 12 5 12 12
## PSP 14 12 11 13 14
# Donors with more than three samples
keep <- df.complete$intern %>%
table() %>%
as.data.frame() %>%
filter(Freq > 3) %>%
pull(".")
# Samples that've been run twice
df.complete %$%
table(intern, area) %>%
as.data.frame() %>%
filter(Freq > 1) %>%
dplyr::arrange(intern)
## intern area Freq
## 1 1381 CER 2
## 2 1381 SN 2
## 3 1381 STR 2
## 4 1392 CER 2
## 5 1392 SN 2
## 6 1394 CER 2
## 7 1394 SN 2
## 8 1394 STR 2
## 9 1398 CER 2
## 10 1407 CER 2
## 11 1446 CER 2
## 12 1446 FC 2
## 13 1446 SN 2
## 14 1446 STR 2
## 15 1462 CER 2
## 16 1464 CER 2
## 17 1464 SN 2
## 18 1464 STR 2
# Overview of samples per donor
df.complete$intern %>%
table() %>%
as.data.frame() %>%
pull(Freq) %>%
table()
## .
## 1 2 3 4 5 6 8 9
## 1 2 3 9 26 1 3 1
# Final df, donors with more than one sample
df.final <- df.complete %>%
filter(intern %in% keep)
# Output for Jonas
# df.final %$%
# table(intern, area) %>%
# as.data.frame() %>%
# reshape2::dcast(intern ~ area, value.var = "Freq") %>%
# write.table("/work/proteomics/02_data/Final_cohort.tsv", sep = "\t", dec = ",", row.names = F)
dat <- cbind(dplyr::select(dat.raw, "PG.ProteinGroups"), dat.raw[, colnames(dat.raw) %in% df.final$id])
We’ll extract most prominent gene names, then reorder our data object.
# Remove PGs with no gene name
# dat$PG.ProteinGroups[sapply(genes, length) == 0]
dat <- dat[!sapply(genes, length) == 0, ] # Test if protein name exists
genes %<>% .[!sapply(., length) == 0] %>%
sget(1)
dat %<>%
`rownames<-`(genes %>% make.unique()) %>%
dplyr::select(!c("PG.ProteinGroups"))
# Remove samples from previous study - REINSTATE THIS LATER!
# dat %<>%
# .[, !grepl("BREG18", colnames(.))]
We’ll remove proteins missing in at least 70% of samples per region.
region <- df.final$area[match(colnames(dat), df.final$id)] %>%
setNames(df.final$id[match(colnames(dat), df.final$id)])
dat.bin <- (!is.na(dat)) * 1
genes.to.keep <- dat.bin %>%
t() %>%
data.frame() %>%
`colnames<-`(dat.bin %>% rownames()) %>%
split(region) %>%
sapply(\(brain.region) (colSums(brain.region) / nrow(brain.region)) %>% .[. > 0.7] %>% names()) %>%
Reduce(c, .) %>%
unique()
dat.filter <- dat[rownames(dat) %in% genes.to.keep, ]
# Set the Ensembl dataset and the corresponding Mart
ensembl_dataset <- "hsapiens_gene_ensembl"
ensembl_mart <- useMart("ensembl", dataset = ensembl_dataset)#, host = "https://useast.ensembl.org")
# Define the attributes to retrieve
attributes <- c("uniprotswissprot", "ensembl_gene_id", "hgnc_symbol")
# Query the Mart
mart_results <- getBM(attributes = attributes, filters = "uniprotswissprot", values = rownames(dat.filter), mart = ensembl_mart)
Create universe
mart_results_universe <- getBM(attributes = attributes, filters = "uniprotswissprot", values = rownames(dat), mart = ensembl_mart)
universe <- mart_results_universe[match(unique(mart_results_universe$uniprotswissprot), mart_results_universe$uniprotswissprot), ] %>% # Dropping multiple ensemblID/HGNC matches to protIDs
mutate(., hgnc_symbol = apply(., 1, \(x) if (x[3] == "") x[1] else x[3])) %>% # If no HGNC match, use protID
mutate(hgnc_symbol = make.unique(hgnc_symbol)) %>% # Make unique HGNC symbols for duplicates
pull(hgnc_symbol)
Transfer IDs
tmp <- mart_results[match(unique(mart_results$uniprotswissprot), mart_results$uniprotswissprot), ] %>% # Dropping multiple ensemblID/HGNC matches to protIDs
mutate(., hgnc_symbol = apply(., 1, \(x) if (x[3] == "") x[1] else x[3])) %>% # If no HGNC match, use protID
mutate(hgnc_symbol = make.unique(hgnc_symbol)) # Make unique HGNC symbols for duplicates
dat.order <- dat.filter %>%
.[match(tmp$uniprotswissprot, rownames(.)), ] %>%
`rownames<-`(tmp$hgnc_symbol)
This will also log transform data
For guide, see https://www.bioconductor.org/packages/devel/bioc/vignettes/DEP/inst/doc/DEP.html#generate-a-summarizedexperiment-object
expDesign <- df.final %>%
mutate(areaRep = makeunique::make_unique(area, wrap_in_brackets = F) %>% strsplit(" ") %>% sget(2),
diagRep = makeunique::make_unique(Diagnosis, wrap_in_brackets = F) %>% strsplit(" ") %>% sget(2),
areaDiagRep = paste(area, Diagnosis, sep = "-") %>% makeunique::make_unique(wrap_in_brackets = F) %>% strsplit(" ") %>% sget(2)) %>%
dplyr::rename(label = id,
condition = area,
replicate = areaRep)
dat.tmp <- dat.order
dat.tmp[is.na(dat.tmp)] <- 0
dat.tmp[dat.tmp == 0] <- NA
dat.se <- dat.tmp %>%
mutate(name = rownames(.),
ID = tmp$ensembl_gene_id[match(rownames(.), tmp$hgnc_symbol)]) %>%
DEP::make_se(seq(205), expDesign)
plot_frequency(dat.se)
plot_numbers(dat.se)
p <- plot_coverage(dat.se)
p$data$Var1 %<>% as.numeric()
p + scale_fill_gradient2(low = "white", mid = "navyblue", high = "firebrick", limits = c(120, 170), midpoint = 145)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
dat.norm <- normalize_vsn(dat.se)
# Extract assay data
dat.norm <- dat.se
# Perform quantile normalization
tmp.norm <- limma::normalizeBetweenArrays(assay(dat.norm), method = "quantile")
# median subtraction per væv, træk sample fra og læg væv (hvis vi kigger diagnoser)/totalt (hvis vi kigger på væv) til
assay(dat.norm) <- tmp.norm
Check for rank log2
message("Before norm")
## Before norm
dat.se %>%
assay() %>%
{setNames(rowMedians(., na.rm = T), rownames(.))} %>%
sort(decreasing = T) %>%
head(30)
## HBB GFAP HBA2 GAPDH ACTG1 PLP1 MBP PKM
## 28.15935 27.16085 27.12984 26.74479 26.66073 25.76940 25.26778 25.26047
## LDHB HBZ PEBP1 TUBB4B ATP1A3 ENO1 CALM1 TUBA1B
## 25.06428 25.05714 24.98948 24.92874 24.86718 24.83812 24.82533 24.77710
## ALDOA MDH1 YWHAB PPIA PRDX2 ALB UCHL1 HSPA8
## 24.73854 24.59701 24.54092 24.51431 24.45924 24.39913 24.36080 24.31425
## TPI1 PRDX1 ENO2 ALDOC S100B ATP5F1B
## 24.23935 24.19145 24.19025 24.14925 24.10355 24.09028
message("After norm")
## After norm
dat.norm %>%
assay() %>%
{setNames(rowMedians(., na.rm = T), rownames(.))} %>%
sort(decreasing = T) %>%
head(30)
## HBB GFAP HBA2 GAPDH ACTG1 PLP1 MBP PKM
## 28.36949 27.19211 27.14875 26.78561 26.61635 25.75122 25.32839 25.29523
## HBZ LDHB ATP1A3 PEBP1 TUBB4B CALM1 ENO1 TUBA1B
## 25.06263 25.05974 24.94624 24.94461 24.93815 24.85258 24.79301 24.75266
## ALDOA MDH1 YWHAB PPIA PRDX2 UCHL1 ALB HSPA8
## 24.72100 24.60810 24.57369 24.55891 24.45265 24.43164 24.41328 24.32569
## TPI1 ENO2 S100B PRDX1 ATP5F1B PGK1
## 24.22291 24.21066 24.16643 24.12402 24.09792 24.09411
meanSdPlot(dat.se)
meanSdPlot(dat.norm)
assay(dat.norm) %>%
as.data.frame() %>%
`colnames<-`(colnames(dat.order)) %>%
tidyr::pivot_longer(cols = everything(),
names_to = "sample", values_to = "value") %>%
mutate(region = unname(region)[match(sample, names(region))]) %>%
arrange(region) %>%
mutate(sample = factor(sample, levels = sample %>% unique())) %>%
ggplot(aes(value, fill = region)) +
geom_histogram(binwidth = 1) +
facet_wrap(~sample, ncol = 6) +
theme_bw()
## Warning: Removed 173901 rows containing non-finite outside the scale range
## (`stat_bin()`).
plot_normalization(dat.se, dat.norm)
dat.plot <- assay(dat.norm) %>%
rowMedians(na.rm = T) %>%
data.frame(dat.norm %>% rownames()) %>%
`colnames<-`(c("med", "gene"))
dat.highlight <- dat.plot %>%
filter(med >= 25)
dat.plot %>%
ggplot(aes(gene, med)) +
geom_point() +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
geom_label_repel(data = dat.highlight, aes(gene, med, label = gene)) +
geom_point(data = dat.highlight, aes(gene, med), col = "red") +
labs(y = "Median normalised intensity")
dat.plot <- assay(dat.norm) %>%
as.data.frame() %>%
`colnames<-`(colnames(dat.order)) %>%
tidyr::pivot_longer(cols = everything(),
names_to = "sample", values_to = "value") %>%
filter(is.na(value)) %>%
mutate(region = unname(region)[match(sample, names(region))]) %>%
arrange(region) %>%
mutate(sample = factor(sample, levels = sample %>% unique()))
dat.plot[is.na(dat.plot)] <- 1
dat.plot %>%
ggplot(aes(sample, value, fill = region)) +
geom_histogram(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_blank()) +
labs(title = "Number of NAs per sample")
## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
plot_missval(dat.norm)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 rows. You can control `use_raster` argument by explicitly setting
## TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
plot_detect(dat.norm)
We need to impute per tissue
expDesign %<>% mutate(id = colnames(dat.norm))
dat.norm.list <- dat.norm %>%
assay() %>%
as.data.frame() %>%
split.default(.,
colnames(.) %>%
strsplit("_") %>%
sget(1))
dat.imp.list <- dat.norm.list %>%
names() %>%
lapply(\(nn) {
tmp.expDesign <- expDesign %>%
filter(condition == nn)
tmp.df <- dat.norm.list[[nn]] %>%
`colnames<-`(tmp.expDesign$label) %>%
mutate(name = rownames(dat.norm),
ID = tmp$ensembl_gene_id[match(rownames(dat.norm), tmp$hgnc_symbol)])
out <- DEP::make_se(tmp.df, seq(nrow(tmp.expDesign)), tmp.expDesign)
assay(out) %<>% 2^.
return(out)
}) %>%
setNames(names(dat.norm.list)) %>%
lapply(DEP::impute, fun = "MinProb")
## Imputing along margin 2 (samples/columns).
## [1] 0.5338544
## Imputing along margin 2 (samples/columns).
## [1] 0.4457345
## Imputing along margin 2 (samples/columns).
## [1] 0.5671568
## Imputing along margin 2 (samples/columns).
## [1] 0.5265539
## Imputing along margin 2 (samples/columns).
## [1] 0.5409824
dat.imp.list.tmp <- list()
for (nn in names(dat.imp.list)) {
tmp2 <- dat.imp.list[[nn]]
rowData(tmp2) %<>%
`colnames<-`(paste(nn, colnames(.), sep = "_"))
dat.imp.list.tmp[[nn]] <- tmp2
}
dat.imp <- do.call(cbind, dat.imp.list.tmp) %>%
.[, match(expDesign$id, colnames(.))]
plot_imputation(dat.norm, dat.imp)
assay(dat.imp) %>%
as.data.frame() %>%
`colnames<-`(colnames(dat.order)) %>%
tidyr::pivot_longer(cols = everything(),
names_to = "sample", values_to = "value") %>%
mutate(region = unname(region)[match(sample, names(region))]) %>%
arrange(region) %>%
mutate(sample = factor(sample, levels = sample %>% unique())) %>%
ggplot(aes(value, fill = region)) +
geom_histogram(binwidth = 1) +
facet_wrap(~sample, ncol = 6) +
theme_bw()
Here, I’m using raw data
# Also check raw values
repr <- df.complete %$%
table(intern, area) %>%
as.data.frame() %>%
filter(Freq > 1) %>%
mutate(sel = paste(intern, area, sep = "-"))
sampleIds <- df.final %>%
mutate(sel = paste(intern, area, sep = "-")) %>%
filter(sel %in% repr$sel) %>%
dplyr::select(id, sel) %>%
arrange(sel) %$%
split(id, sel)
df.tmp <- assay(dat.se) %>% # Raw data, otherwise use dat.norm
as.data.frame() %>%
`colnames<-`(colnames(dat.order))
corr <- sampleIds %>%
lapply(\(x) df.tmp[, x]) %>%
lapply(\(x) x[apply(x, 1, \(y) if (any(is.na(y))) F else T), ]) %>%
sapply(\(x) cor(x[, 1], x[, 2], method = "spearman")) %>%
formatC(digits = 3)
sampleIds %>%
lapply(\(x) df.tmp[, x]) %>%
{Map(\(z, nn, cc) plot(x = z[, 1], y = z[, 2], main = paste(nn, cc, sep = ", Spearman rho ")), z = ., nn = names(.), cc = unname(corr))}
## $`1381-CER`
## NULL
##
## $`1381-SN`
## NULL
##
## $`1381-STR`
## NULL
##
## $`1392-CER`
## NULL
##
## $`1392-SN`
## NULL
##
## $`1394-CER`
## NULL
##
## $`1394-SN`
## NULL
##
## $`1394-STR`
## NULL
##
## $`1398-CER`
## NULL
##
## $`1407-CER`
## NULL
##
## $`1446-CER`
## NULL
##
## $`1446-FC`
## NULL
##
## $`1446-SN`
## NULL
##
## $`1446-STR`
## NULL
##
## $`1462-CER`
## NULL
##
## $`1464-CER`
## NULL
##
## $`1464-SN`
## NULL
##
## $`1464-STR`
## NULL
corr %>%
{data.frame(cor = unname(.) %>% as.numeric(), sample = names(.))} %>%
mutate(area = strsplit(sample, "-") %>% sget(2)) %>%
ggplot(aes(area, cor, col = area)) +
geom_jitter(width = 0.2) +
theme_bw() +
ylim(c(0, 1)) +
labs(y = "Spearman's r", x = "", col = "Area")
annot_colors <- list(
Area = c(CER="red",
FC="blue",
MED = "purple",
# PFC = "lightblue",
SN = "green3",
STR = "yellow3",
PUT = "orange"))
cnames <- dat.imp %>%
colnames()
annot <- cnames %>%
strsplit("_") %>%
sget(1) %>%
as.data.frame() %>%
`dimnames<-`(list(cnames, "Area"))
set.seed(1337)
assay(dat.imp) %>%
as.data.frame() %>%
pheatmap(annotation_col = annot,
show_rownames = F,
color=colorRampPalette(c("navy", "white", "red"))(50),
annotation_colors = annot_colors)
d <- dist(assay(dat.imp) %>% t())
dd <- data.matrix(d) %>% `dimnames<-`(list(cnames,cnames))
pheatmap(dd,
annotation_col = annot,
show_rownames = T,
color=colorRampPalette(c("navy", "white", "red"))(50),
annotation_colors = annot_colors)
Please disregard the last plot.
annot_colors <- list(
Area = c(CER="red",
FC="blue",
MED = "purple",
SN = "green3",
STR = "yellow3"))
set.seed(1337)
split(expDesign$label, expDesign$condition) %>%
append(., .[4]) %>%
lapply(\(area) {
dat.sub <- assay(dat.imp) %>%
as.data.frame() %>%
`colnames<-`(expDesign$label[match(colnames(.), expDesign$id)]) %>%
.[, area] %>%
`colnames<-`(expDesign$id[match(colnames(.), expDesign$label)])
cnames <- dat.sub %>%
colnames()
annot <- cnames %>%
strsplit("_") %>%
sget(1) %>%
data.frame() %>%
`dimnames<-`(list(cnames, "Area"))
pheatmap(mat = dat.sub,
annotation_col = annot,
show_rownames = F,
color=colorRampPalette(c("navy", "firebrick"))(50),
annotation_colors = annot_colors)
})
## $CER
##
## $FC
##
## $MED
##
## $SN
##
## $STR
##
## $SN
Per area
region %>%
unique() %>%
lapply(\(area) {
pc.res <- assay(dat.imp) %>%
.[, grepl(pattern = area, colnames(.))] %>%
as.data.frame() %>%
t() %>%
prcomp(center = T,
scale = T)
dat.plot <- pc.res$x %>%
data.frame() %>%
mutate(.,
region = rownames(.) %>% strsplit("_") %>% sget(1),
var = pc.res$sdev^2 / sum(pc.res$sdev^2),
csum = cumsum(pc.res$sdev^2)/sum(pc.res$sdev^2),
id = rownames(.))
plot_grid(plotlist = list(
dat.plot %>%
ggplot(aes(PC1, PC2, col = region)) +
geom_point() +
theme_bw(),
dat.plot %>%
ggplot(aes(PC1, PC3, col = region)) +
geom_point() +
theme_bw(),
dat.plot %>%
ggplot(aes(PC2, PC3, col = region)) +
geom_point() +
theme_bw()
), ncol = 1)
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
Prepare
pc.res <- assay(dat.imp) %>%
as.data.frame() %>%
`colnames<-`(expDesign$label) %>%
t() %>%
prcomp(center = F,
scale = F)
dat.plot <- pc.res$x %>%
data.frame() %>%
mutate(.,
region = unname(region)[match(rownames(.), names(region))],
var = pc.res$sdev^2 / sum(pc.res$sdev^2),
csum = cumsum(pc.res$sdev^2)/sum(pc.res$sdev^2),
id = rownames(.)) %>%
mutate(sample = df.final$intern[match(.$id, df.final$id)],
diagnosis = df.final$Diagnosis[match(.$id, df.final$id)])
Plot top PCs, colour by region
plot_grid(plotlist = list(
dat.plot %>%
ggplot(aes(PC1, PC2, col = region)) +
geom_point() +
theme_bw(),
dat.plot %>%
ggplot(aes(PC1, PC3, col = region)) +
geom_point() +
theme_bw(),
dat.plot %>%
ggplot(aes(PC2, PC3, col = region)) +
geom_point() +
theme_bw()
), ncol = 1)
Plot top PCs, colour by diagnosis
plot_grid(plotlist = list(
dat.plot %>%
ggplot(aes(PC1, PC2, col = diagnosis)) +
geom_point() +
theme_bw(),
dat.plot %>%
ggplot(aes(PC1, PC3, col = diagnosis)) +
geom_point() +
theme_bw(),
dat.plot %>%
ggplot(aes(PC2, PC3, col = diagnosis)) +
geom_point() +
theme_bw()
), ncol = 1)
dat.plot %>%
ggplot(aes(region, fill = diagnosis)) +
geom_bar(position = position_dodge()) +
facet_wrap(~region, scales = "free_x") +
labs(title = "Number of samples per area per diagnosis")
Scree plot
We should consider PCs that explain more than 5% variation
n.pcs = 10
dat.plot[1:n.pcs, ] %>%
ggplot(aes(seq(n.pcs), var)) +
geom_point() +
theme_bw() +
geom_hline(yintercept = 0.05, col = "red")
However, we should also include enough PCs to control more than 80% total variance
n.pcs = 200
dat.plot[1:n.pcs, ] %>%
ggplot(aes(seq(n.pcs), csum)) +
geom_point() +
geom_hline(yintercept = 0.8, col = "red") +
theme_bw() +
labs(y = "Cumulative variance explained")
3D plot of top PCs. Use the mouse to move around the models
Per region
plotly::plot_ly(data = dat.plot, x = ~PC1, y = ~PC2, z = ~PC3, color = ~region)
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Per diagnosis
plotly::plot_ly(data = dat.plot, x = ~PC1, y = ~PC2, z = ~PC3, color = ~diagnosis)
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
tsne.df <- assay(dat.imp) %>%
t() %>%
Rtsne::Rtsne(perplexity = 15, normalize = F) %>%
{as.data.frame(.$Y)} %>%
`colnames<-`(c("X", "Y"))
plot_grid(plotlist = list(
tsne.df %>%
mutate(area = df.final$area[match(colnames(dat.order), df.final$id)]) %>%
ggplot(aes(X, Y, col = area)) +
geom_point() +
theme_dark() +
labs(x = "tSNE1", y = "tSNE2") +
scale_color_manual(values = ggsci::pal_jama()(5)),
tsne.df %>%
mutate(diagnosis = df.final$Diagnosis[match(colnames(dat.order), df.final$id)]) %>%
ggplot(aes(X, Y, col = diagnosis)) +
geom_point() +
theme_dark() +
labs(x = "tSNE1", y = "tSNE2") +
scale_color_manual(values = ggsci::pal_aaas()(5))
))
expDesign %<>%
mutate(comb = paste(intern, condition, sep ="-") %>%
makeunique::make_unique(sep = ".", wrap_in_brackets = F))
doublets <- expDesign %>%
pull(comb) %>%
{expDesign$id[grepl(".2", ., fixed = T)]}
outliers <- c("1381-MED", "1469-FC", "1467-FC", "1469-CER", "1381-CER.1", "1385-CER", "1410-CER", "1381-FC", "1469-MED", "1385-MED", "1467-SN", "1490-SN", "1381-CER.1", "1467-STR", "1398-STR") %>% # "1446-FC.2"?
{expDesign$id[match(., expDesign$cnames)]}
remIds <- c(doublets, outliers)
dat.clean <- dat.imp %>%
.[, !colnames(.) %in% remIds]
expDesign.clean <- expDesign %>%
filter(!id %in% remIds) %$%
.[match(colnames(dat.clean), id), ]
cnames.order <- expDesign.clean$label[match(colnames(dat.clean), expDesign.clean$id)]
annot_colors <- list(
Area = c(CER="red",
FC="blue",
MED = "purple",
# PFC = "lightblue",
SN = "green3",
STR = "yellow3",
PUT = "orange",
doublet = "grey",
outlier = "black"))
cnames <- dat.imp %>%
colnames()
annot <- cnames %>%
strsplit("_") %>%
sget(1) %>%
as.data.frame() %>%
`dimnames<-`(list(cnames, "Area"))
annot$Area[rownames(annot) %in% doublets] <- "doublet"
annot$Area[rownames(annot) %in% outliers] <- "outlier"
set.seed(1337)
assay(dat.imp) %>%
as.data.frame() %>%
pheatmap(annotation_col = annot,
show_rownames = F,
color=colorRampPalette(c("navy", "red"))(200),
annotation_colors = annot_colors)
d <- dist(assay(dat.imp) %>% t())
dd <- data.matrix(d) %>% `dimnames<-`(list(cnames,cnames))
pheatmap(dd,
annotation_col = annot,
show_rownames = T,
color=colorRampPalette(c("navy", "white", "red"))(50),
annotation_colors = annot_colors)
Please disregard the last plot.
split(expDesign$label, expDesign$condition) %>%
append(., .[4]) %>%
lapply(\(area) {
dat.sub <- assay(dat.imp) %>%
as.data.frame() %>%
`colnames<-`(expDesign$label)%>%
.[, area]
cnames <- expDesign %>%
.[match(colnames(dat.sub), .$label), ] %>%
mutate(cnames = paste(intern, condition, sep ="-") %>%
makeunique::make_unique(sep = ".", wrap_in_brackets = F)) %>%
pull(cnames)
annot <- expDesign$condition[expDesign$label %in% colnames(dat.sub)] %>%
data.frame() %>%
`dimnames<-`(list(cnames, "Area"))
annot$Area[rownames(annot) %in% doublets] <- "doublet"
annot$Area[rownames(annot) %in% outliers] <- "outlier"
pheatmap(mat = dat.sub %>% `colnames<-`(cnames),
annotation_col = annot,
show_rownames = F,
color=colorRampPalette(c("navy", "firebrick"))(50),
annotation_colors = annot_colors)
})
## $CER
##
## $FC
##
## $MED
##
## $SN
##
## $STR
##
## $SN
annot_colors <- list(
Area = c(CER="red",
FC="blue",
MED = "purple",
SN = "green3",
STR = "yellow3"))
cnames <- dat.clean %>%
colnames()
annot <- cnames %>%
strsplit("_") %>%
sget(1) %>%
as.data.frame() %>%
`dimnames<-`(list(cnames, "Area"))
set.seed(1337)
assay(dat.clean) %>%
as.data.frame() %>%
pheatmap(annotation_col = annot,
show_rownames = F,
color=colorRampPalette(c("navy", "white", "red"))(50),
annotation_colors = annot_colors)
d <- dist(assay(dat.clean) %>% t())
dd <- data.matrix(d) %>% `dimnames<-`(list(cnames,cnames))
pheatmap(dd,
annotation_col = annot,
show_rownames = T,
color=colorRampPalette(c("navy", "white", "red"))(50),
annotation_colors = annot_colors)
Please disregard the last plot.
annot_colors <- list(
Area = c(CER="red",
FC="blue",
MED = "purple",
SN = "green3",
STR = "yellow3"))
set.seed(1337)
split(expDesign.clean$id, expDesign.clean$condition) %>%
append(., .[4]) %>%
lapply(\(area) {
dat.sub <- assay(dat.clean) %>%
as.data.frame() %>%
.[, area]
cnames <- dat.sub %>%
colnames()
annot <- cnames %>%
strsplit("_") %>%
sget(1) %>%
as.data.frame() %>%
`dimnames<-`(list(cnames, "Area"))
pheatmap(mat = dat.sub,
annotation_col = annot,
show_rownames = F,
color=colorRampPalette(c("navy", "white", "red"))(50),
annotation_colors = annot_colors)
})
## $CER
##
## $FC
##
## $MED
##
## $SN
##
## $STR
##
## $SN
Prepare
pc.res <-
assay(dat.clean) %>%
as.data.frame() %>%
t() %>%
prcomp(center = T,
scale = T)
dat.plot <- pc.res$x %>%
data.frame() %>%
mutate(.,
region = expDesign.clean$condition[match(rownames(.), expDesign.clean$id)],
var = pc.res$sdev^2 / sum(pc.res$sdev^2),
csum = cumsum(pc.res$sdev^2)/sum(pc.res$sdev^2),
id = rownames(.)) %>%
mutate(sample = expDesign.clean$intern[match((rownames(.)), expDesign.clean$id)],
diagnosis = expDesign.clean$Diagnosis[match(rownames(.), expDesign.clean$id)])
dat.plot %>%
ggplot(aes(region, fill = diagnosis)) +
geom_bar(position = position_dodge()) +
facet_wrap(~region, scales = "free_x") +
labs(title = "Number of samples per area per diagnosis")
Scree plot
We should consider PCs that explain more than 5% variation
n.pcs = 10
dat.plot[1:n.pcs, ] %>%
ggplot(aes(seq(n.pcs), var)) +
geom_point() +
theme_bw() +
geom_hline(yintercept = 0.05, col = "red")
However, we should also include enough PCs to control more than 80% total variance I’m not sure what to do here
n.pcs = 200
dat.plot[1:n.pcs, ] %>%
ggplot(aes(seq(n.pcs), csum)) +
geom_point() +
geom_hline(yintercept = 0.8, col = "red") +
theme_bw() +
labs(y = "Cumulative variance explained")
## Warning: Removed 13 rows containing missing values or values outside the scale range
## (`geom_point()`).
Plot top PCs, colour by region
plot_grid(plotlist = list(
dat.plot %>%
ggplot(aes(PC1, PC2, col = region)) +
geom_point() +
theme_bw(),
dat.plot %>%
ggplot(aes(PC1, PC3, col = region)) +
geom_point() +
theme_bw(),
dat.plot %>%
ggplot(aes(PC2, PC3, col = region)) +
geom_point() +
theme_bw()
), ncol = 1)
Plot top PCs, colour by diagnosis
plot_grid(plotlist = list(
dat.plot %>%
ggplot(aes(PC1, PC2, col = diagnosis)) +
geom_point() +
theme_bw(),
dat.plot %>%
ggplot(aes(PC1, PC3, col = diagnosis)) +
geom_point() +
theme_bw(),
dat.plot %>%
ggplot(aes(PC2, PC3, col = diagnosis)) +
geom_point() +
theme_bw()
), ncol = 1)
3D plot of top PCs. Use the mouse to move around the models
Per region
plotly::plot_ly(data = dat.plot, x = ~PC1, y = ~PC2, z = ~PC3, color = ~region)
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Per diagnosis
plotly::plot_ly(data = dat.plot, x = ~PC1, y = ~PC2, z = ~PC3, color = ~diagnosis)
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
tsne.df <- assay(dat.clean) %>%
t() %>%
Rtsne::Rtsne(perplexity = 15, normalize = F) %>%
{as.data.frame(.$Y)} %>%
`dimnames<-`(list(colnames(dat.clean), c("X", "Y")))
plot_grid(plotlist = list(
tsne.df %>%
mutate(area = expDesign.clean$condition[match(rownames(.), expDesign.clean$id)]) %>%
ggplot(aes(X, Y, col = area)) +
geom_point() +
theme_dark() +
labs(x = "tSNE1", y = "tSNE2") +
scale_color_manual(values = ggsci::pal_jama()(5)),
tsne.df %>%
mutate(diagnosis = expDesign.clean$Diagnosis[match(rownames(.), expDesign.clean$id)]) %>%
ggplot(aes(X, Y, col = diagnosis)) +
geom_point() +
theme_dark() +
labs(x = "tSNE1", y = "tSNE2") +
scale_color_manual(values = ggsci::pal_aaas()(5))
))
list(
dat.clean = dat.clean,
dat.norm = dat.norm,
dat.norm.noout = dat.norm %>% .[, !colnames(.) %in% remIds],
expDesign.clean = expDesign.clean,
expDesign = expDesign,
universe = universe
) %>%
qsave("/work/proteomics/02_data/03_objects/preprocessed_data.qs")
tt - Sys.time()
## Time difference of -5.732775 mins
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /opt/intel/oneapi/mkl/2025.3/lib/libmkl_gf_lp64.so.2; LAPACK version 3.12.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Berlin
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] DEP_1.26.0 qs_0.27.3
## [3] SummarizedExperiment_1.38.1 Biobase_2.68.0
## [5] GenomicRanges_1.60.0 GenomeInfoDb_1.44.0
## [7] IRanges_2.42.0 S4Vectors_0.46.0
## [9] BiocGenerics_0.54.0 generics_0.1.4
## [11] MatrixGenerics_1.20.0 matrixStats_1.5.0
## [13] pheatmap_1.0.12 biomaRt_2.64.0
## [15] scHelper_0.0.5 ggpubr_0.6.0
## [17] ggrepel_0.9.6 cowplot_1.1.3
## [19] ggplot2_3.5.2 dplyr_1.1.4
## [21] magrittr_2.0.3
##
## loaded via a namespace (and not attached):
## [1] later_1.4.2 norm_1.0-11.1
## [3] filelock_1.0.3 tibble_3.2.1
## [5] makeunique_1.0.0 preprocessCore_1.70.0
## [7] XML_3.99-0.18 lifecycle_1.0.4
## [9] httr2_1.2.1 rstatix_0.7.2
## [11] doParallel_1.0.17 lattice_0.22-7
## [13] MASS_7.3-65 crosstalk_1.2.1
## [15] MultiAssayExperiment_1.34.0 backports_1.5.0
## [17] plotly_4.10.4 limma_3.64.0
## [19] sass_0.4.10 rmarkdown_2.29
## [21] jquerylib_0.1.4 yaml_2.3.10
## [23] httpuv_1.6.16 MsCoreUtils_1.20.0
## [25] conos_1.5.2 DBI_1.2.3
## [27] RColorBrewer_1.1-3 abind_1.4-8
## [29] Rtsne_0.17 purrr_1.0.4
## [31] AnnotationFilter_1.32.0 rappdirs_0.3.3
## [33] sandwich_3.1-1 circlize_0.4.16
## [35] GenomeInfoDbData_1.2.14 MSnbase_2.30.1
## [37] ncdf4_1.24 codetools_0.2-20
## [39] DelayedArray_0.34.1 DT_0.33
## [41] xml2_1.4.0 RApiSerialize_0.1.4
## [43] tidyselect_1.2.1 gmm_1.8
## [45] Spectra_1.18.0 shape_1.4.6.1
## [47] UCSC.utils_1.4.0 farver_2.1.2
## [49] BiocFileCache_2.16.0 jsonlite_2.0.0
## [51] GetoptLong_1.0.5 Formula_1.2-5
## [53] iterators_1.0.14 foreach_1.5.2
## [55] tools_4.5.2 progress_1.2.3
## [57] Rcpp_1.0.14 glue_1.8.0
## [59] gridExtra_2.3 SparseArray_1.8.0
## [61] BiocBaseUtils_1.10.0 xfun_0.52
## [63] shinydashboard_0.7.3 withr_3.0.2
## [65] BiocManager_1.30.25 fastmap_1.2.0
## [67] digest_0.6.37 mime_0.13
## [69] R6_2.6.1 imputeLCMD_2.1
## [71] colorspace_2.1-1 Cairo_1.6-2
## [73] sccore_1.0.5 dichromat_2.0-0.1
## [75] RSQLite_2.3.11 hexbin_1.28.5
## [77] ggsci_3.2.0 tidyr_1.3.1
## [79] data.table_1.17.2 htmlwidgets_1.6.4
## [81] prettyunits_1.2.0 PSMatch_1.12.0
## [83] httr_1.4.7 S4Arrays_1.8.0
## [85] pkgconfig_2.0.3 gtable_0.3.6
## [87] blob_1.2.4 ComplexHeatmap_2.24.0
## [89] impute_1.82.0 XVector_0.48.0
## [91] htmltools_0.5.8.1 carData_3.0-5
## [93] MALDIquant_1.22.3 ProtGenerics_1.40.0
## [95] clue_0.3-66 scales_1.4.0
## [97] tmvtnorm_1.6 png_0.1-8
## [99] knitr_1.50 MetaboCoreUtils_1.16.1
## [101] rstudioapi_0.17.1 tzdb_0.5.0
## [103] reshape2_1.4.4 rjson_0.2.23
## [105] curl_7.0.0 cachem_1.1.0
## [107] zoo_1.8-14 GlobalOptions_0.1.2
## [109] stringr_1.5.1 parallel_4.5.2
## [111] AnnotationDbi_1.70.0 mzID_1.46.0
## [113] vsn_3.76.0 pillar_1.10.2
## [115] grid_4.5.2 vctrs_0.6.5
## [117] pcaMethods_2.0.0 promises_1.3.2
## [119] car_3.1-3 stringfish_0.16.0
## [121] dbplyr_2.5.1 xtable_1.8-4
## [123] cluster_2.1.8.1 evaluate_1.0.3
## [125] magick_2.8.6 readr_2.1.5
## [127] mvtnorm_1.3-3 cli_3.6.5
## [129] compiler_4.5.2 rlang_1.1.6
## [131] crayon_1.5.3 ggsignif_0.6.4
## [133] labeling_0.4.3 QFeatures_1.18.0
## [135] affy_1.86.0 plyr_1.8.9
## [137] fs_1.6.6 stringi_1.8.7
## [139] viridisLite_0.4.2 BiocParallel_1.42.0
## [141] assertthat_0.2.1 Biostrings_2.76.0
## [143] lazyeval_0.2.2 Matrix_1.7-3
## [145] hms_1.1.3 bit64_4.6.0-1
## [147] shiny_1.10.0 KEGGREST_1.48.0
## [149] statmod_1.5.0 mzR_2.38.0
## [151] leidenAlg_1.1.5 igraph_2.1.4
## [153] broom_1.0.8 memoise_2.0.1
## [155] RcppParallel_5.1.10 affyio_1.78.0
## [157] bslib_0.9.0 bit_4.6.0